Se realizan tests estadísticos sobre los resultados obtenidos en train por los modelos y, posteriormente, se aplica el algoritmo de decisión multicriterio.
import pandas as pd
from pandas import read_csv
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
import joblib as joblib
import matplotlib.pyplot as plt
import scikit_posthocs as scp
import scipy.stats as stats
import operator
import numpy as np
import statsmodels as st
import pingouin as pg
import math
import sklearn.metrics as sm
import plotly.express as px
from flask import Flask
import flask
from plotly.subplots import make_subplots
import plotly.graph_objects as go
En esta primera sección visualizamos, de forma gráfica y numérica, los errores cometidos en las predicciones, tanto en training (en cada uno de los pliegues) como en test, en la predicción t. De esta forma podremos apoyarnos en los resultados para detectar modelos que puedan estar haciendo overfitting.
# Per-fold RMSE of every model on the training folds; the fold-identifier
# column ("Pliegues") is discarded so that only model columns remain.
RMSE_errores_train = read_csv(
    '../Datos_preprocesados/RMSE_errores_train.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).drop(columns="Pliegues")
RMSE_errores_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.010250 | 0.012253 | 0.040040 | 0.021895 | 0.010777 | 0.012175 | 0.015468 | 0.017608 | 0.009779 | 0.011421 | 0.009565 | 0.011302 |
| 1 | 0.003465 | 0.005035 | 0.006011 | 0.008859 | 0.005831 | 0.004163 | 0.006113 | 0.004916 | 0.003236 | 0.005702 | 0.005437 | 0.004665 |
| 2 | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.007964 | 0.006088 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 3 | 0.005817 | 0.005213 | 0.008027 | 0.008158 | 0.006893 | 0.007117 | 0.012659 | 0.018517 | 0.005418 | 0.005386 | 0.006962 | 0.007347 |
| 4 | 0.019228 | 0.018612 | 0.015113 | 0.017661 | 0.014720 | 0.014294 | 0.018877 | 0.022049 | 0.018109 | 0.019953 | 0.015303 | 0.015526 |
| 5 | 0.097717 | 0.099564 | 0.030118 | 0.027917 | 0.031447 | 0.029454 | 0.031626 | 0.026445 | 0.094637 | 0.099538 | 0.031492 | 0.028937 |
| 6 | 0.054043 | 0.054168 | 0.047621 | 0.046455 | 0.048174 | 0.044944 | 0.050317 | 0.046399 | 0.049422 | 0.054665 | 0.048253 | 0.045141 |
| 7 | 0.117732 | 0.113723 | 0.064359 | 0.061369 | 0.063967 | 0.061773 | 0.066249 | 0.059950 | 0.118328 | 0.118405 | 0.064465 | 0.062454 |
| 8 | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
| 9 | 0.060135 | 0.064386 | 0.029916 | 0.038327 | 0.029938 | 0.036199 | 0.029868 | 0.036866 | 0.048429 | 0.046525 | 0.029645 | 0.035670 |
# Summary statistics (count, mean, std, min, quartiles, max) of the RMSE
# values, one column per model.
RMSE_errores_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.049563 | 0.051771 | 0.032960 | 0.032719 | 0.029975 | 0.030343 | 0.032815 | 0.033264 | 0.047135 | 0.050586 | 0.029860 | 0.030430 |
| std | 0.048881 | 0.051414 | 0.027178 | 0.027624 | 0.028047 | 0.028913 | 0.027536 | 0.027363 | 0.048149 | 0.052089 | 0.028119 | 0.028922 |
| min | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.006113 | 0.004916 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 25% | 0.006925 | 0.006973 | 0.009799 | 0.011060 | 0.007864 | 0.008381 | 0.013362 | 0.017836 | 0.006509 | 0.007132 | 0.007613 | 0.008336 |
| 50% | 0.036635 | 0.036390 | 0.030017 | 0.024906 | 0.022329 | 0.021874 | 0.024372 | 0.024247 | 0.033269 | 0.033239 | 0.022474 | 0.022231 |
| 75% | 0.088322 | 0.090770 | 0.045726 | 0.044423 | 0.043993 | 0.042758 | 0.045644 | 0.044016 | 0.083333 | 0.088320 | 0.044063 | 0.042774 |
| max | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
# One box per model column, showing the spread of its training RMSE values.
fig = px.box(
    RMSE_errores_train,
    y=RMSE_errores_train.columns,
    width=700,
    height=400,
)
fig.show()
# Per-fold MAE of every model on the training folds; the fold-identifier
# column ("Pliegues") is discarded so that only model columns remain.
MAE_errores_train = read_csv(
    '../Datos_preprocesados/MAE_errores_train.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).drop(columns="Pliegues")
MAE_errores_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.008462 | 0.011392 | 0.032479 | 0.014992 | 0.007227 | 0.007082 | 0.012623 | 0.017494 | 0.007774 | 0.009879 | 0.006476 | 0.005981 |
| 1 | 0.003400 | 0.005021 | 0.004186 | 0.003734 | 0.004024 | 0.002734 | 0.005366 | 0.003984 | 0.003021 | 0.005115 | 0.003658 | 0.003443 |
| 2 | 0.001714 | 0.002173 | 0.001866 | 0.003266 | 0.001329 | 0.001685 | 0.005990 | 0.004277 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| 3 | 0.002247 | 0.002934 | 0.003994 | 0.002763 | 0.002256 | 0.002159 | 0.007597 | 0.010134 | 0.001828 | 0.001652 | 0.001959 | 0.002001 |
| 4 | 0.012167 | 0.013229 | 0.011721 | 0.007150 | 0.011192 | 0.012083 | 0.014557 | 0.018390 | 0.012484 | 0.013561 | 0.012501 | 0.012333 |
| 5 | 0.084448 | 0.087158 | 0.016943 | 0.016096 | 0.017917 | 0.016959 | 0.021310 | 0.012787 | 0.082231 | 0.087985 | 0.020494 | 0.016918 |
| 6 | 0.033824 | 0.033456 | 0.026278 | 0.026127 | 0.025823 | 0.027261 | 0.034723 | 0.028557 | 0.022534 | 0.032382 | 0.026455 | 0.025546 |
| 7 | 0.065309 | 0.056947 | 0.036142 | 0.032730 | 0.035894 | 0.033170 | 0.040139 | 0.031767 | 0.074404 | 0.061051 | 0.037323 | 0.032217 |
| 8 | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
| 9 | 0.043867 | 0.055397 | 0.020134 | 0.024806 | 0.020085 | 0.023185 | 0.018272 | 0.020647 | 0.030652 | 0.040988 | 0.019380 | 0.022019 |
# Summary statistics (count, mean, std, min, quartiles, max) of the MAE
# values, one column per model.
MAE_errores_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.035397 | 0.038609 | 0.020654 | 0.017455 | 0.018104 | 0.016971 | 0.022021 | 0.019194 | 0.033625 | 0.036978 | 0.018395 | 0.016443 |
| std | 0.036337 | 0.040109 | 0.016466 | 0.013860 | 0.017191 | 0.014463 | 0.017656 | 0.012644 | 0.037485 | 0.039825 | 0.017260 | 0.013971 |
| min | 0.001714 | 0.002173 | 0.001866 | 0.002763 | 0.001329 | 0.001685 | 0.005366 | 0.003984 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| 25% | 0.004666 | 0.006614 | 0.006070 | 0.004588 | 0.004825 | 0.003821 | 0.008853 | 0.010797 | 0.004210 | 0.006306 | 0.004363 | 0.004077 |
| 50% | 0.022996 | 0.023343 | 0.018539 | 0.015544 | 0.014555 | 0.014521 | 0.016415 | 0.017942 | 0.017509 | 0.022971 | 0.015940 | 0.014626 |
| 75% | 0.059948 | 0.056559 | 0.030929 | 0.025797 | 0.024388 | 0.026242 | 0.031370 | 0.026579 | 0.063466 | 0.056035 | 0.024965 | 0.024665 |
| max | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
# One box per model column, showing the spread of its training MAE values.
fig = px.box(
    MAE_errores_train,
    y=MAE_errores_train.columns,
    width=700,
    height=400,
)
fig.show()
# Per-fold CC values of every model on the training folds; the
# fold-identifier column ("Pliegues") is discarded so that only model
# columns remain.
CC_train = read_csv(
    '../Datos_preprocesados/CC_train.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).drop(columns="Pliegues")
CC_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.660 | 0.851 | 0.396 | 0.465 | 0.431 | 0.433 | 0.485 | 0.465 | 0.592 | 0.875 | 0.458 | 0.450 |
| 1 | 0.725 | 0.647 | 0.065 | -0.042 | 0.160 | 0.047 | 0.245 | -0.009 | 0.269 | 0.734 | 0.202 | 0.109 |
| 2 | 0.866 | 0.907 | 0.467 | 0.556 | 0.486 | 0.571 | 0.491 | 0.553 | 0.646 | 0.895 | 0.497 | 0.571 |
| 3 | 0.926 | 0.934 | 0.337 | 0.433 | 0.320 | 0.445 | 0.336 | 0.535 | 0.871 | 0.942 | 0.315 | 0.442 |
| 4 | 0.863 | 0.889 | 0.568 | 0.601 | 0.561 | 0.602 | 0.689 | 0.748 | 0.786 | 0.981 | 0.571 | 0.598 |
| 5 | 0.928 | 0.932 | 0.793 | 0.803 | 0.796 | 0.803 | 0.824 | 0.826 | 0.938 | 0.985 | 0.796 | 0.802 |
| 6 | 0.837 | 0.910 | 0.517 | 0.583 | 0.508 | 0.600 | 0.477 | 0.690 | 0.909 | 0.976 | 0.501 | 0.598 |
| 7 | 0.943 | 0.944 | 0.888 | 0.913 | 0.887 | 0.910 | 0.860 | 0.873 | 0.966 | 0.981 | 0.884 | 0.907 |
| 8 | 0.933 | 0.948 | 0.864 | 0.885 | 0.862 | 0.879 | 0.817 | 0.811 | 0.989 | 0.981 | 0.861 | 0.879 |
| 9 | 0.804 | 0.782 | 0.620 | 0.586 | 0.609 | 0.591 | 0.621 | 0.638 | 0.844 | 0.968 | 0.608 | 0.587 |
# Summary statistics (count, mean, std, min, quartiles, max) of the CC
# values, one column per model.
CC_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.848500 | 0.874400 | 0.551500 | 0.578300 | 0.562000 | 0.588100 | 0.584500 | 0.613000 | 0.781000 | 0.931800 | 0.569300 | 0.594300 |
| std | 0.095342 | 0.094558 | 0.255903 | 0.274144 | 0.235375 | 0.252187 | 0.212598 | 0.257212 | 0.222787 | 0.079698 | 0.225801 | 0.236204 |
| min | 0.660000 | 0.647000 | 0.065000 | -0.042000 | 0.160000 | 0.047000 | 0.245000 | -0.009000 | 0.269000 | 0.734000 | 0.202000 | 0.109000 |
| 25% | 0.812250 | 0.860500 | 0.413750 | 0.487750 | 0.444750 | 0.476500 | 0.479000 | 0.539500 | 0.681000 | 0.906750 | 0.467750 | 0.480250 |
| 50% | 0.864500 | 0.908500 | 0.542500 | 0.584500 | 0.534500 | 0.595500 | 0.556000 | 0.664000 | 0.857500 | 0.972000 | 0.536000 | 0.592500 |
| 75% | 0.927500 | 0.933500 | 0.749750 | 0.752500 | 0.749250 | 0.752750 | 0.785000 | 0.795250 | 0.930750 | 0.981000 | 0.749000 | 0.751000 |
| max | 0.943000 | 0.948000 | 0.888000 | 0.913000 | 0.887000 | 0.910000 | 0.860000 | 0.873000 | 0.989000 | 0.985000 | 0.884000 | 0.907000 |
# One box per model column, showing the spread of its training CC values.
fig = px.box(
    CC_train,
    y=CC_train.columns,
    width=700,
    height=400,
)
fig.show()
Como hemos comentado, nos apoyaremos en las gráficas: consideraremos que están haciendo overfitting aquellos modelos para los que haya una diferencia de 0.20 o más entre las predicciones de train y test.
# Scaled predictor tables for the lag-3 and lag-5 feature sets, indexed by
# the 'Fecha' column.
df_predictor_lag3_escalado = read_csv(
    '../Datos_preprocesados/predictor_lag3_escalado.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).set_index('Fecha')
df_predictor_lag5_escalado = read_csv(
    '../Datos_preprocesados/predictor_lag5_escalado.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).set_index('Fecha')

# Unshuffled 80/20 split: the first 80% of the rows go to train and the
# remaining 20% to test. 'Incidentes' is the target column.
opciones_split = dict(train_size=0.8, test_size=0.2, random_state=42, shuffle=False)

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    df_predictor_lag3_escalado.drop(columns=['Incidentes']),
    df_predictor_lag3_escalado['Incidentes'],
    **opciones_split,
)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    df_predictor_lag5_escalado.drop(columns=['Incidentes']),
    df_predictor_lag5_escalado['Incidentes'],
    **opciones_split,
)
# Serialized models, listed as lag-3 / lag-5 pairs per algorithm. The order
# must stay aligned, position by position, with `nombres` below.
archivos_modelos = [
    'rf_lag3_escalado.pkl',
    'rf_lag5_escalado.pkl',
    'regresion_lineal_lag3_escalado.pkl',
    'regresion_lineal_lag5_escalado.pkl',
    'lasso_lag3_escalado.pkl',
    'lasso_lag5_escalado.pkl',
    'svr_lag3_escalado.pkl',
    'svr_lag5_escalado.pkl',
    'gbr_lag3_escalado.pkl',
    'gbr_lag5_escalado.pkl',
    'en_lag3_escalado.pkl',
    'en_lag5_escalado.pkl',
]
modelos = [joblib.load(archivo) for archivo in archivos_modelos]

# Display names, one per entry of `modelos`.
nombres = [
    'RF Lag3', 'RF Lag5',
    'LR Lag3', 'LR Lag5',
    'Lasso Lag3', 'Lasso Lag5',
    'SVR Lag3', 'SVR Lag5',
    'GBR Lag3', 'GBR Lag5',
    'EN Lag3', 'EN Lag5',
]
def pred_train_test(modelo, lag, nombre=None):
    """Plot predictions against real values on train and test, and return the errors.

    A two-panel figure is shown (train on the left, test on the right), each
    panel overlaying the model's predictions and the real target values.

    Parameters
    ----------
    modelo
        Fitted estimator exposing ``predict``.
    lag : int
        Either 3 or 5; selects the module-level ``*_3`` or ``*_5`` splits.
    nombre : str, optional
        Label appended to the figure title. When omitted, the label is taken
        from the module-level ``nombres`` list at the position ``modelo``
        occupies in ``modelos``; if the model is not in that list, its class
        name is used.

    Returns
    -------
    tuple of float
        ``(mae_train, mae_test, rmse_train, rmse_test)``.

    Raises
    ------
    ValueError
        If ``lag`` is neither 3 nor 5.
    """
    particiones = {
        3: (X_train_3, y_train_3, X_test_3, y_test_3),
        5: (X_train_5, y_train_5, X_test_5, y_test_5),
    }
    if lag not in particiones:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(f"lag must be 3 or 5, got {lag!r}")
    X_train, y_train, X_test, y_test = particiones[lag]

    if nombre is None:
        # Resolve the label from the model itself rather than from a loop
        # index left behind in the global namespace by the caller.
        nombre = next(
            (etiqueta for candidato, etiqueta in zip(modelos, nombres) if candidato is modelo),
            type(modelo).__name__,
        )

    y_pred_train_modelo = modelo.predict(X_train)
    y_pred_test_modelo = modelo.predict(X_test)

    fig = make_subplots(rows=1, cols=2)
    # Legend names now match the series they describe (they were swapped).
    fig.add_trace(go.Scatter(x=y_train.index, y=y_pred_train_modelo, mode='lines', name='Pred Train'), row=1, col=1)
    fig.add_trace(go.Scatter(x=y_train.index, y=y_train, mode='lines', name='Train Real'), row=1, col=1)
    fig.add_trace(go.Scatter(x=y_test.index, y=y_pred_test_modelo, mode='lines', name='Pred test'), row=1, col=2)
    fig.add_trace(go.Scatter(x=y_test.index, y=y_test, mode='lines', name='Test Real'), row=1, col=2)
    fig.update_layout(showlegend=True, title_text='Predicciones Train VS Test ' + str(nombre))
    # The former `flask.Markup(fig)` call was dropped: its result was
    # discarded, and `flask.Markup` no longer exists in recent Flask releases.
    fig.show("notebook")

    mae_train = sm.mean_absolute_error(y_train, y_pred_train_modelo)
    mae_test = sm.mean_absolute_error(y_test, y_pred_test_modelo)
    rmse_train = math.sqrt(sm.mean_squared_error(y_train, y_pred_train_modelo))
    rmse_test = math.sqrt(sm.mean_squared_error(y_test, y_pred_test_modelo))
    return mae_train, mae_test, rmse_train, rmse_test
# Train/test error metrics, one entry per model, in the order of `modelos`.
maes_train, maes_test = [], []
rmses_train, rmses_test = [], []

# `i` is deliberately kept as the module-level loop index; code outside this
# cell may read it.
for i, modelo in enumerate(modelos):
    nombre = nombres[i]
    # Models alternate lag 3 / lag 5: even positions are lag 3, odd are lag 5.
    lag = 5 if i % 2 else 3
    mae_train, mae_test, rmse_train, rmse_test = pred_train_test(modelo, lag)
    maes_train.append(mae_train)
    maes_test.append(mae_test)
    rmses_train.append(rmse_train)
    rmses_test.append(rmse_test)